# Air Quality Prediction in Busy Streets
# basics
import os
import requests
from decouple import config
import itertools
import warnings
warnings.filterwarnings('ignore')
# data
import math
import pandas as pd
import numpy as np
import scipy.stats as ss
import zipfile
from collections import defaultdict
from datetime import datetime, timezone, timedelta
import osmium as osm
import urllib.request
import joblib
import wget
from bs4 import BeautifulSoup
from pandas.io.json import json_normalize
import json
import holidays
# plotting
from matplotlib import pyplot as plt
import seaborn as sns
import chartify
from IPython.display import Image
from IPython.core.display import HTML
# geo spatial
import contextily as ctx
import geopandas as gpd
from pandas_geojson import read_geojson_url
from shapely.geometry import Point
import geocoder
import folium
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import matplotlib.cm as cm # Matplotlib and associated plotting modules
import matplotlib.colors as colors
from opencage.geocoder import OpenCageGeocode
import joblib
import geopandas as gpd
import geopandas as gp
import shapely.speedups
from shapely.geometry import Point, Polygon
# preprocess
from statsmodels.regression.linear_model import OLS
from statsmodels.tsa.stattools import lagmat,pacf,ccf
from statsmodels.tools.tools import add_constant
import ray
# modelling
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
import xgboost as xgb
import shap
import statsmodels.api as sm
from statsmodels.tsa.stattools import pacf, acf
from statsmodels.graphics.tsaplots import plot_pacf, plot_acf
from sklearn import metrics
from scipy.spatial.distance import cdist
from sklearn.cluster import KMeans # KMeans for clustering
from statsmodels.tsa.seasonal import seasonal_decompose
from sklearn.metrics import classification_report, mean_squared_error,r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from fbprophet import Prophet
# Global plot style for every seaborn/matplotlib figure below.
sns.set_style('darkgrid')
# OpenWeather API key, read from the environment/.env via python-decouple
# so the secret stays out of the notebook source.
API_KEY = config('OPEN_WEATHER_KEY')
# Manually download osm.pbf file (145 MB)
# One-off download of the Geofabrik OpenStreetMap extract for the
# Noord-Holland province into ./data/ (includes Amsterdam).
# NOTE(review): the %%writefile and ! lines below are IPython cell magics /
# shell escapes from the original notebook, not plain Python — this file
# only runs inside Jupyter.
urllib.request.urlretrieve("https://download.geofabrik.de/europe/netherlands/noord-holland-latest.osm.pbf", "./data/noord-holland-latest.osm.pbf")
%%writefile fetch_osm.json
{
"extracts": [
{
"output": "amsterdam.osh.pbf",
"output_format": "osh.pbf",
"description": "extract OSM history for Amsterdam (NL)",
"bbox": {"left": 4.8321,
"right": 4.9575,
"top": 52.3867,
"bottom": 52.3389}
}
],
"directory": "./data/"
}
# Limit the osm file with borders of Amsterdam
!osmium extract --with-history --config=fetch_osm.json ./data/noord-holland-latest.osm.pbf
def get_timeseries_dataframe():
    """
    Retrieve the main Dutch air-pollution dataset and clean it.

    Downloads hourly measurements for the Stadhouderskade sensor, pivots
    them to one column per pollutant component, and reindexes onto a
    continuous hourly timeline so missing hours show up as NaN rows.
    https://wdl-data.fra1.digitaloceanspaces.com/unstudio/stadhouderskade_air_quality_2014_to_2022.csv
    """
    raw = pd.read_csv("https://wdl-data.fra1.digitaloceanspaces.com/unstudio/stadhouderskade_air_quality_2014_to_2022.csv")
    # -99 is the provider's sentinel for a missing measurement
    raw = raw.replace(-99, np.nan)
    wide = (
        raw.pivot_table(index='timestamp_measured',
                        columns='component_id',
                        values='value')
           .rename_axis(columns=None)
           .reset_index()
    )
    wide['timestamp_measured'] = pd.to_datetime(wide['timestamp_measured'])
    # Complete hourly index over the observed period
    hourly_index = pd.date_range(
        start=wide['timestamp_measured'].min(),
        end=wide['timestamp_measured'].max(),
        freq="60min",
    )
    result = pd.merge(
        pd.Series(hourly_index, name='timestamp_measured'),
        wide,
        left_on='timestamp_measured',
        right_on='timestamp_measured',
        how='left',
    )
    # Drop any timezone info so the column merges cleanly with naive timestamps
    result['timestamp_measured'] = result['timestamp_measured'].apply(lambda ts: ts.tz_localize(None))
    return result
# -------
def get_historic_weather(lon, lat, start, end, verbose=False):
    """
    Retrieve OpenWeather hourly history between `start` and `end` (ISO-format
    date strings, interpreted as UTC) and format it into pandas DataFrames.

    Returns
    -------
    history_df : DataFrame
        One row per hour with numeric measurements (temp, wind, rain, ...).
    weather_df : DataFrame
        One row per reported weather condition per hour (an hour can carry
        several conditions), with id/main/description.

    https://history.openweathermap.org/data/2.5/history/city?
    """
    HISTORY_BASE_URL = 'https://history.openweathermap.org/data/2.5/history/city?'
    history_dict = defaultdict(list)
    weather_dict = defaultdict(list)
    start_dt = datetime.fromisoformat(start)
    end_dt = datetime.fromisoformat(end)
    # Walk the requested range one week at a time.
    # NOTE(review): `end` is re-sent unchanged on every request; presumably the
    # API caps each response at ~1 week so the windows don't overlap — confirm,
    # otherwise duplicate hourly rows could be appended.
    while start_dt < end_dt:
        start_timestamp = start_dt.replace(tzinfo=timezone.utc).timestamp()
        end_timestamp = end_dt.replace(tzinfo=timezone.utc).timestamp()
        request_url = "{}lon={}&lat={}&type=hour&start={}&end={}&appid={}".format(
            HISTORY_BASE_URL, lon, lat, round(start_timestamp), round(end_timestamp), API_KEY)
        r = requests.get(request_url)
        historic_weather = r.json()
        for day in historic_weather['list']:
            # Timestamps kept as UTC strings; parsed later via format_time()
            history_dict['dt'].append(datetime.utcfromtimestamp(day['dt']).strftime('%Y-%m-%d %H:%M:%S'))
            history_dict['temp'].append(day['main']['temp'])
            history_dict['feels_like_temp'].append(day['main']['feels_like'])
            history_dict['pressure'].append(day['main']['pressure'])
            history_dict['humidity'].append(day['main']['humidity'])
            history_dict['temp_min'].append(day['main']['temp_min'])
            history_dict['temp_max'].append(day['main']['temp_max'])
            history_dict['wind_speed'].append(day['wind']['speed'])
            history_dict['wind_direction'].append(day['wind']['deg'])
            # rain/snow/clouds keys are absent when there was none — default 0
            if 'rain' in day.keys():
                history_dict['rain_last_hour'].append(day['rain']['1h'])
            else:
                history_dict['rain_last_hour'].append(0)
            if 'snow' in day.keys():
                history_dict['snow_last_hour'].append(day['snow']['1h'])
            else:
                history_dict['snow_last_hour'].append(0)
            if 'clouds' in day.keys():
                history_dict['clouds'].append(day['clouds']['all'])
            else:
                history_dict['clouds'].append(0)
            # Conditions go to a separate long table keyed by the same 'dt'
            for w in day['weather']:
                weather_dict['dt'].append(datetime.utcfromtimestamp(day['dt']).strftime('%Y-%m-%d %H:%M:%S'))
                weather_dict['id'].append(w['id'])
                weather_dict['weather'].append(w['main'])
                weather_dict['weather_description'].append(w['description'])
        if(verbose):
            print(start_dt)
        # Advance the window by the assumed one-week API page size
        start_dt = start_dt + timedelta(days=7)
    history_df = pd.DataFrame.from_dict(history_dict)
    weather_df = pd.DataFrame.from_dict(weather_dict)
    return history_df, weather_df
def format_time(df, timecol="dt"):
    """
    Parse the `timecol` column of `df` to pandas datetimes, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose timestamp column should be converted.
    timecol : str, default "dt"
        Name of the column holding timestamp strings.

    Returns
    -------
    pandas.DataFrame
        The same (mutated) frame, returned so calls can be chained.
        Callers that previously ignored the (None) return are unaffected.
    """
    df[timecol] = pd.to_datetime(df[timecol])
    return df
# -------
def get_traffic_forecast():
    """
    Retrieve the municipal traffic model for the Stadhouderskade and return
    plotting-ready layers in web mercator (EPSG:3857).

    Returns
    -------
    (road_location, sensor_location, rijksmuseum_location) :
        road segments with 2015-vs-2030 daily-traffic ("etmaal") comparison,
        the air-quality sensor position, and the Rijksmuseum position.
    https://api.data.amsterdam.nl/dcatd/datasets/zTHEXVzwMqQI9w/purls/4
    """
    fname = "https://api.data.amsterdam.nl/dcatd/datasets/zTHEXVzwMqQI9w/purls/4"
    df_traffic = gpd.read_file(fname)
    on_street = df_traffic.name == "Stadhouderskade"
    year_2015 = df_traffic[on_street & (df_traffic.modeljaar == "2015")]
    year_2030 = df_traffic[on_street & (df_traffic.modeljaar == "2030")]
    # Join both model years on the road-segment id
    df_compare = pd.merge(year_2015, year_2030, on="linknr", suffixes=('_2015', '_2030'))
    # Absolute and relative change in daily traffic volume per segment
    df_compare["diff"] = df_compare["etmaal_2030"] - df_compare["etmaal_2015"]
    df_compare["diff_rel"] = 100 * (df_compare["etmaal_2030"] - df_compare["etmaal_2015"]) / df_compare["etmaal_2015"]
    road_location = gpd.GeoDataFrame(df_compare, geometry="geometry_2015", crs=4326).to_crs(epsg=3857)
    # sensor (two identical points, matching the original frame shape)
    sensor_location = pd.DataFrame({"longitude": (4.8999651, 4.8999651),
                                    "latitude": (52.3580345, 52.3580345)})
    sensor_location = gpd.GeoDataFrame(
        sensor_location,
        geometry=gpd.points_from_xy(sensor_location.longitude, sensor_location.latitude),
        crs=4326,
    ).to_crs(epsg=3857)
    # rijksmuseum
    rijksmuseum_location = pd.DataFrame({"longitude": (4.8856791, 4.8856791),
                                         "latitude": (52.3603291, 52.3603291)})
    rijksmuseum_location = gpd.GeoDataFrame(
        rijksmuseum_location,
        geometry=gpd.points_from_xy(rijksmuseum_location.longitude, rijksmuseum_location.latitude),
        crs=4326,
    ).to_crs(epsg=3857)
    return (road_location, sensor_location, rijksmuseum_location)
# -------
def get_and_process_survey_data():
    """
    Retrieve survey data that our team ("EVERYTHING IS AWESOME") created and
    spread to Dutch friends, Facebook groups & the green mile mailing list,
    with the help of UNStudio:
    https://docs.google.com/spreadsheets/d/e/2PACX-1vQ8iuitntEOP9TR1c5E4vT5cMrtYVPsiLTWx-gjPXTmnxd73uNCnwPjCh8k-xXFa1FGSlN3zRWLhNES/pub?output=csv
    and demographic data acquired from citypopulation:
    https://www.citypopulation.de/en/netherlands/admin/noord_holland/0363__amsterdam/

    Returns
    -------
    (survey, df_demo_super, df_compare_plot) :
        raw survey answers; survey-vs-city age distribution (long format);
        and the "top 3 favourite features" shares, raw vs demographically
        reweighted, stacked for plotting.
    """
    page = "https://docs.google.com/spreadsheets/d/e/2PACX-1vQ8iuitntEOP9TR1c5E4vT5cMrtYVPsiLTWx-gjPXTmnxd73uNCnwPjCh8k-xXFa1FGSlN3zRWLhNES/pub?output=csv"
    survey = pd.read_csv(page)
    survey.columns = ['ts','age','gender','favourite_place','top3','winter_vs_summer','top3_stadhouderskade','other']
    # extract top3 features: each respondent listed up to three favourites in one cell
    a = survey["top3"].apply(lambda x: x.split(", ")[0])
    # NOTE(review): when the answer does not split into exactly 3 comma-parts,
    # `b` falls back to the FIRST part (duplicating `a`) — confirm intended.
    b = survey["top3"].apply(lambda x: x.split(", ")[1] if len(x.split(","))==3 else x.split(",")[0] )
    c = survey["top3"].apply(lambda x: x.split(", ")[-1])
    # stack the three picks into long format, keeping age/gender for reweighting
    survey_top3 = pd.concat([
        pd.merge(survey[["age","gender"]],a,left_index=True, right_index=True)
        ,pd.merge(survey[["age","gender"]],b,left_index=True, right_index=True)
        ,pd.merge(survey[["age","gender"]],c,left_index=True, right_index=True)
    ])
    # Get demographic info (hard-coded city-wide population per age bracket)
    #source: https://www.citypopulation.de/en/netherlands/admin/noord_holland/0363__amsterdam/
    df_demo = pd.DataFrame(data={
        "0-9 years": 86460
        ,"10-19 years": 82753
        ,"20-30 years": 167958
        ,"31-40 years": 164893
        ,"41-50 years": 114707
        ,"51-60 years": 111891
        ,"61-70 years": 86059
        ,"71-80 years": 53898
        ,"81-90 years": 21044
        ,"90+ years": 4120
    },index=[0]).T.reset_index()
    df_demo.columns=["age","count"]
    df_demo["percent"] = 100*df_demo["count"]/df_demo['count'].sum()
    df_demo["type"] = "demography"
    # Survey age distribution (respondents who declined to share age excluded)
    df_demo_s = pd.DataFrame(survey_top3[survey_top3["age"]!="Will not share"].groupby(["age"]).size()).reset_index().rename(columns = {0:'count'})
    df_demo_s["percent"] = 100*df_demo_s["count"]/df_demo_s['count'].sum()
    df_demo_s["type"] = "survey"
    # Long table with one row per (age bracket, source) for side-by-side plotting
    df_demo_super = pd.concat([df_demo,df_demo_s]).groupby(["age","type"]).sum().reset_index()
    # correct the main data based on the demographics:
    # weight = population share of the bracket / survey share of the bracket
    df_scale = pd.merge(df_demo_s,df_demo,how="left",on="age",suffixes=('','_pop'))
    df_scale["count_norm"] = df_scale["percent_pop"]/(df_scale["percent"])
    df_scale["count"] = 1
    # Attach the weights to each answer, then aggregate per favourite feature
    df_compare = pd.merge(survey_top3,df_scale[["age","count","count_norm"]],on="age",suffixes=('','_s'))
    df_compare = df_compare.groupby("top3").sum().reset_index()
    def pd_percent(df,col):
        # column as percentage of its own total
        return 100*df[col]/df[col].sum()
    df_compare["survey"] = pd_percent(df_compare,"count")
    df_compare["survey_normed"] = pd_percent(df_compare,"count_norm")
    # Stack raw and reweighted shares into one plottable frame
    df_c = df_compare[["top3","survey"]].copy()
    df_c["type"] = "survey_raw"
    df_c_norm = df_compare[["top3","survey_normed"]].copy()
    df_c_norm["type"] = "normed"
    df_c_norm.columns=df_c.columns
    df_compare_plot = pd.concat([df_c,df_c_norm])
    return (survey,df_demo_super,df_compare_plot)
# ---
def get_amsterdam_osm_tags():
    """
    Collect every OSM node tag from ./data/amsterdam.osh.pbf into a point
    GeoDataFrame (EPSG:4326), one row per (node, tag) pair with columns
    type/id/location/version/tagkey/tagvalue plus point geometry.
    """

    class _NodeTagCollector(osm.SimpleHandler):
        """osmium handler accumulating [type, id, location, version, key, value] rows."""

        def __init__(self):
            super().__init__()
            self.rows = []

        def node(self, n):
            self._collect(n, "node")

        def _collect(self, elem, elem_type):
            for tag in elem.tags:
                self.rows.append([elem_type,
                                  elem.id,
                                  elem.location,
                                  elem.version,
                                  tag.k,
                                  tag.v])

    collector = _NodeTagCollector()
    collector.apply_file("./data/amsterdam.osh.pbf")
    frame = pd.DataFrame(collector.rows,
                         columns=['type', 'id', 'location', 'version', 'tagkey', 'tagvalue'])
    # osmium Location stringifies as "lon/lat"
    frame['lon'] = frame['location'].apply(lambda loc: str(loc).split('/')[0])
    frame['lat'] = frame['location'].apply(lambda loc: str(loc).split('/')[1])
    points = gpd.GeoDataFrame(frame, geometry=gpd.points_from_xy(frame.lon, frame.lat))
    return points.set_crs('EPSG:4326')
def load_pedestrian_data():
    """
    Load the Amsterdam crowd-monitor (pedestrian counter) WFS layer and
    return one point per selected sensor, as a GeoDataFrame in EPSG:4326.
    """
    fname = "https://api.data.amsterdam.nl/v1/wfs/crowdmonitor/?SERVICE=WFS&VERSION=2.0.0&REQUEST=GetFeature&TYPENAMES=passanten&OUTPUTFORMAT=geojson"
    crowd = gpd.read_file(fname, low_memory=False)
    crowd['geometry'] = crowd['geometry'].to_crs(4326)
    # Counters near the city centre / Stadhouderskade that the model uses
    important_sensors = ['Korte Niezel','Oudekennissteeg','Stoofsteeg','Oudezijds Voorburgwal t.h.v. 91','Oudezijds Achterburgwal t.h.v. 86','Oudezijds Achterburgwal t.h.v. 91','Oudezijds Voorburgwal t.h.v. 206','Molensteeg','Oudebrugsteeg','Damstraat','Kloveniersburgwal','Bloedstraat','Nieuwebrugsteeg','Oudezijds Voorburgwal t.h.v. 107','Oudezijds Achterburgwal t.h.v. 116','Stormsteeg t.h.v. 3','Stadhouderskade','Museumplein Noord','Museumplein Zuid','Amstelveenseweg','Kalverstraat t.h.v. 1','Kalverstraat Zuid t.h.v.73','Nieuwendijk t.h.v.218','Rokin 66','Damrak 1-5','Dam-total','Dam-zone','Albert Cuypstraat']
    # Keep one location row per sensor name, then restrict to the list above
    per_sensor = crowd.groupby('naam_locatie').first()[['geometry']].reset_index()
    per_sensor = per_sensor[per_sensor['naam_locatie'].isin(important_sensors)]
    return per_sensor.set_crs('EPSG:4326')
def calculate_osm_features(pedestrian_df, tags):
    """
    Enrich each sensor location in `pedestrian_df` with OSM-derived features.

    For every (tagkey, tagvalue) pair of interest, adds four columns:
      * <key>_<value>_count_100m / _200m / _300m : number of matching OSM
        nodes within that radius (metres, measured in EPSG:3857)
      * <key>_<value>_min_distance : distance to the nearest matching node
        (NaN when the tag does not occur at all)

    Parameters
    ----------
    pedestrian_df : GeoDataFrame of sensor points (CRS convertible to 3857).
    tags : GeoDataFrame of OSM nodes with 'tagkey'/'tagvalue' columns.

    Returns
    -------
    pedestrian_df, mutated in place with the feature columns and returned.
    """
    interesting_tags = [{'tagkey':'leisure', 'tagvalue':'playground'},{'tagkey':'leisure', 'tagvalue':'sports_centre'},{'tagkey':'leisure', 'tagvalue':'picnic_table'},{'tagkey':'bus', 'tagvalue':'yes'},{'tagkey':'subway', 'tagvalue':'yes'},{'tagkey':'tram', 'tagvalue':'yes'},{'tagkey':'amenity', 'tagvalue':'restaurant'},{'tagkey':'amenity', 'tagvalue':'bench'},{'tagkey':'amenity', 'tagvalue':'cafe'},{'tagkey':'amenity', 'tagvalue':'fast_food'},{'tagkey':'amenity', 'tagvalue':'bicycle_parking'},{'tagkey':'amenity', 'tagvalue':'pub'},{'tagkey':'amenity', 'tagvalue':'drinking_water'},{'tagkey':'amenity', 'tagvalue':'waste_basket'},{'tagkey':'amenity', 'tagvalue':'charging_station'},{'tagkey':'amenity', 'tagvalue':'post_box'},{'tagkey':'amenity', 'tagvalue':'bar'},{'tagkey':'amenity', 'tagvalue':'brothel'},{'tagkey':'amenity', 'tagvalue':'atm'},{'tagkey':'amenity', 'tagvalue':'parking'},{'tagkey':'amenity', 'tagvalue':'toilets'},{'tagkey':'amenity', 'tagvalue':'school'},{'tagkey':'amenity', 'tagvalue':'community_centre'},{'tagkey':'tourism', 'tagvalue':'artwork'},{'tagkey':'tourism', 'tagvalue':'hotel'},{'tagkey':'tourism', 'tagvalue':'information'},{'tagkey':'tourism', 'tagvalue':'museum'},{'tagkey':'tourism', 'tagvalue':'attraction'},{'tagkey':'tourism', 'tagvalue':'picnic_site'},{'tagkey':'tourism', 'tagvalue':'hostel'},{'tagkey':'tourism', 'tagvalue':'viewpoint'},{'tagkey':'tourism', 'tagvalue':'guest_house'},{'tagkey':'tourism', 'tagvalue':'gallery'}, {'tagkey':'shop', 'tagvalue':'clothes'},{'tagkey':'shop', 'tagvalue':'hairdresser'},{'tagkey':'shop', 'tagvalue':'supermarket'},{'tagkey':'shop', 'tagvalue':'gift'},{'tagkey':'shop', 'tagvalue':'bicycle'},{'tagkey':'shop', 'tagvalue':'convenience'},{'tagkey':'shop', 'tagvalue':'bakery'}]
    # PERF FIX: project both layers to metres ONCE. The original re-projected
    # the full tag layer inside `.apply` for every sensor row, and recomputed
    # the same distance matrix four times per tag (once per radius + min).
    sensors_m = pedestrian_df.to_crs('EPSG:3857').geometry
    tags_m = tags.to_crs('EPSG:3857')

    def _pairwise_distances(tag_key, tag_value):
        # rows = sensors, columns = matching OSM nodes, values = metres
        subset = tags_m[(tags_m['tagkey'] == tag_key) & (tags_m['tagvalue'] == tag_value)].reset_index()
        return sensors_m.apply(lambda g: subset.geometry.distance(g))

    for tag in interesting_tags:
        key, value = tag['tagkey'], tag['tagvalue']
        dists = _pairwise_distances(key, value)
        # strict '<' kept to match the original counting behaviour
        pedestrian_df[f"{key}_{value}_count_100m"] = (dists < 100).sum(axis=1)
        pedestrian_df[f"{key}_{value}_count_200m"] = (dists < 200).sum(axis=1)
        pedestrian_df[f"{key}_{value}_count_300m"] = (dists < 300).sum(axis=1)
        pedestrian_df[f"{key}_{value}_min_distance"] = dists.min(axis=1)
    return pedestrian_df
def get_interesting_tags(tags):
    """
    Filter the OSM tag table down to the (tagkey, tagvalue) pairs used as
    model features, returning only the tagkey/tagvalue/geometry columns.

    SECURITY/IDIOM FIX: the original built one giant boolean expression as a
    string and `eval`-ed it; this performs the identical selection with a
    vectorised membership test on (tagkey, tagvalue) pairs.
    """
    interesting_tags = [{'tagkey':'leisure', 'tagvalue':'playground'},{'tagkey':'leisure', 'tagvalue':'sports_centre'},{'tagkey':'leisure', 'tagvalue':'picnic_table'},{'tagkey':'bus', 'tagvalue':'yes'},{'tagkey':'subway', 'tagvalue':'yes'},{'tagkey':'tram', 'tagvalue':'yes'},{'tagkey':'amenity', 'tagvalue':'restaurant'},{'tagkey':'amenity', 'tagvalue':'bench'},{'tagkey':'amenity', 'tagvalue':'cafe'},{'tagkey':'amenity', 'tagvalue':'fast_food'},{'tagkey':'amenity', 'tagvalue':'bicycle_parking'},{'tagkey':'amenity', 'tagvalue':'pub'},{'tagkey':'amenity', 'tagvalue':'drinking_water'},{'tagkey':'amenity', 'tagvalue':'waste_basket'},{'tagkey':'amenity', 'tagvalue':'charging_station'},{'tagkey':'amenity', 'tagvalue':'post_box'},{'tagkey':'amenity', 'tagvalue':'bar'},{'tagkey':'amenity', 'tagvalue':'brothel'},{'tagkey':'amenity', 'tagvalue':'atm'},{'tagkey':'amenity', 'tagvalue':'parking'},{'tagkey':'amenity', 'tagvalue':'toilets'},{'tagkey':'amenity', 'tagvalue':'school'},{'tagkey':'amenity', 'tagvalue':'community_centre'},{'tagkey':'tourism', 'tagvalue':'artwork'},{'tagkey':'tourism', 'tagvalue':'hotel'},{'tagkey':'tourism', 'tagvalue':'information'},{'tagkey':'tourism', 'tagvalue':'museum'},{'tagkey':'tourism', 'tagvalue':'attraction'},{'tagkey':'tourism', 'tagvalue':'picnic_site'},{'tagkey':'tourism', 'tagvalue':'hostel'},{'tagkey':'tourism', 'tagvalue':'viewpoint'},{'tagkey':'tourism', 'tagvalue':'guest_house'},{'tagkey':'tourism', 'tagvalue':'gallery'}, {'tagkey':'shop', 'tagvalue':'clothes'},{'tagkey':'shop', 'tagvalue':'hairdresser'},{'tagkey':'shop', 'tagvalue':'supermarket'},{'tagkey':'shop', 'tagvalue':'gift'},{'tagkey':'shop', 'tagvalue':'bicycle'},{'tagkey':'shop', 'tagvalue':'bakery'},{'tagkey':'shop', 'tagvalue':'convenience'}]
    wanted_pairs = [(t['tagkey'], t['tagvalue']) for t in interesting_tags]
    pair_index = pd.MultiIndex.from_frame(tags[['tagkey', 'tagvalue']])
    mask = pair_index.isin(wanted_pairs)
    return tags[mask][['tagkey', 'tagvalue', 'geometry']]
def calculate_features_raw():
    """
    Build the OSM feature tables for all pedestrian sensors and for the
    Stadhouderskade reference point, and cache them as pickles under ./data/.

    Returns
    -------
    (tags, pedestrian_df, stadhouderskade_df, raw_tags) :
        full OSM node-tag table; sensor points with OSM features;
        Stadhouderskade point with OSM features; filtered amenity tag table.
    """
    tags = get_amsterdam_osm_tags()
    pedestrian_df = load_pedestrian_data()
    pedestrian_df = calculate_osm_features(pedestrian_df, tags)
    # Single reference point on the Stadhouderskade (the air-quality sensor street)
    st_df = pd.DataFrame([{'naam_locatie': 'stadhouderskade', 'lon': 4.881055, 'lat': 52.361385}])
    stadhouderskade_df = gpd.GeoDataFrame(data=st_df, geometry=gpd.points_from_xy(st_df.lon, st_df.lat)).set_crs('EPSG:4326')
    stadhouderskade_df = calculate_osm_features(stadhouderskade_df, tags)
    raw_tags = get_interesting_tags(tags)
    joblib.dump(pedestrian_df, './data/osm_features.pkl')
    # BUG FIX: this previously dumped pedestrian_df a second time, so the
    # Stadhouderskade features were never written to disk.
    joblib.dump(stadhouderskade_df, './data/stadhouderskade_osm_features.pkl')
    joblib.dump(raw_tags, './data/amenities_tags.pkl')
    return tags, pedestrian_df, stadhouderskade_df, raw_tags
def pccf_ols(x, y, nlags=40, use_ray=False):
    '''Calculate partial cross-correlations between two series via OLS.

    Parameters
    ----------
    x : 1d array
        observations of the time series that is lagged (moved).
    y : 1d array
        observations of the target time series.
    nlags : int
        Number of lags for which the statistic is returned.
    use_ray : bool
        If True, fit the per-lag OLS regressions in parallel with ray.

    Returns
    -------
    pccf : list
        partial crosscorrelations, nlags+1 elements (lags 0..nlags).

    Notes
    -----
    This solves a separate OLS estimation for each desired lag.
    '''
    # Shared setup for both paths: lag matrix of x plus an intercept column,
    # and the aligned (unlagged) y.
    xlags, _ = lagmat(x, nlags, original='sep')
    _, y0 = lagmat(y, nlags, original='sep')
    # BUG FIX: the constant column is now added BEFORE the matrix is put into
    # the ray object store. Previously the parallel branch shipped the
    # no-intercept matrix (the later add_constant result went unused), so
    # use_ray=True silently fitted different regressions than use_ray=False.
    xlags = add_constant(xlags)
    if use_ray:
        xlags_ref = ray.put(xlags)
        y0_ref = ray.put(y0)

        @ray.remote
        def _fit_lag(k):
            # last coefficient = partial cross-correlation at lag k
            res = OLS(ray.get(y0_ref)[k:], ray.get(xlags_ref)[k:, :k + 1]).fit()
            return res.params[-1]

        pccf = ray.get([_fit_lag.remote(k) for k in range(0, nlags + 1)])
    else:
        pccf = []
        for k in range(0, nlags + 1):
            res = OLS(y0[k:], xlags[k:, :k + 1]).fit()
            pccf.append(res.params[-1])
    return pccf
# ----
def analyse_for_lags(df, target, max_lag, threshold=0.2, pccf=False, **kwargs):
    """
    Return, per column, the lags whose (partial) correlation with `target`
    exceeds `threshold`, limited to `max_lag`.

    Parameters
    ----------
    df : DataFrame holding the target and candidate feature columns.
    target : str, column to be explained.
    max_lag : int, largest lag considered.
    threshold : float, minimum (partial) correlation for a lag to be kept.
    pccf : bool, use partial cross-correlation (per-lag OLS, slower) instead
        of the plain cross-correlation function.
    **kwargs : forwarded to pccf_ols (e.g. use_ray=True).

    Returns
    -------
    dict mapping column name -> list of selected lag indices.
    """
    # BUG FIX: the body previously referenced a notebook global `df_joined`
    # instead of the `df` parameter, so the function only worked by accident
    # when a variable with that exact name happened to exist.
    lag_dict = {}
    # autocorrelation of the target itself (lag 0 excluded — trivially 1)
    partial = pd.Series(data=pacf(df[target].fillna(0), nlags=max_lag))
    lag_dict[target] = list(partial[1:][partial[1:] > threshold].index)
    # cross correlation of each numeric feature column against the target
    numerics = ['uint8', 'int16', 'int32', 'int64', 'float64']
    for c in df.select_dtypes(include=numerics).columns:
        if (c == target):
            continue
        if pccf:
            partial_cross_corr = pd.Series(data=pccf_ols(df[target].fillna(0),
                                                         df[c].fillna(0),
                                                         nlags=max_lag, **kwargs
                                                         ))
            lag_dict[c] = list(partial_cross_corr[1:max_lag][partial_cross_corr[1:max_lag] > threshold].index)
        else:
            cross_corr = pd.Series(data=ccf(df[target].fillna(0),
                                            df[c].fillna(0)
                                            , adjusted=True
                                            , fft=True
                                            ))
            lag_dict[c] = list(cross_corr[1:max_lag][cross_corr[1:max_lag] > threshold].index)
    return lag_dict
# ----
def add_lag_features(df, lags_dict, min_lags=0):
    """
    Add shifted copies of columns to `df`.

    For every column in `lags_dict`, creates a `<col>_lag<k>` column as
    `df[col].shift(-k)` for each k in that column's lag list. Columns whose
    list is empty get lags 1..min_lags instead.

    NOTE(review): shift(-k) pulls values from k rows AHEAD, so these "lag"
    columns actually contain future values — confirm this matches the
    intended target/feature construction.

    Returns the mutated DataFrame.
    """
    for column, requested_lags in lags_dict.items():
        lags = requested_lags if len(requested_lags) > 0 else range(1, min_lags + 1)
        for k in lags:
            df[column + "_lag" + str(k)] = df[column].shift(-k)
    return df
# ---
def train_xgb_regressor(df, lr=0.01, max_depth=5, target="target"):
    """
    Fit an XGBoost regressor on a random 70/30 train/test split of `df`.

    The `target` column is used as the label and dropped from the features.
    Returns (model, df_train, df_test).
    """
    train_part, test_part = train_test_split(df, test_size=0.3)
    features = train_part.drop(labels=[target], axis=1)
    regressor = xgb.XGBRegressor(learning_rate=lr, max_depth=max_depth, n_jobs=4,)
    regressor.fit(features, train_part[target])
    return regressor, train_part, test_part
def plot_feature_graphs(model, X, plot_all=True, max_display=7):
    """
    Render SHAP explanation plots for a fitted tree model.

    Shows the beeswarm and bar summary plots (top `max_display` features)
    and, when `plot_all` is set, one dependence plot per column of `X`.
    """
    #xgb.plot_importance(model)
    plt.show()
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X)
    shap.summary_plot(shap_values, X, max_display=max_display)
    shap.summary_plot(shap_values, X, plot_type='bar', max_display=max_display)
    if not plot_all:
        return
    for feature in X.columns:
        shap.dependence_plot(feature, shap_values, X)
# --- Clustering
def prepare_and_cluster():
df_amsterdam = pd.read_html('https://en.wikipedia.org/wiki/Boroughs_of_Amsterdam')[1]
df_amsterdam_renamed = df_amsterdam.rename(columns={"Location (in green)": "Location"}).drop(columns = ["Location", "Population density", "Area"])
df_Neighborhoods = pd.DataFrame({"Neighbourhoods": df_amsterdam_renamed['Neighbourhoods']})
Neigh=[]
for key in df_Neighborhoods.Neighbourhoods:
Neigh.append(key)
def extractDigits(lst):
res = []
for el in lst:
sub = el.split(',')
res=res+sub
return(res)
Neigh_extented = extractDigits(Neigh)
df_Neigh_extented = pd.DataFrame(Neigh_extented, columns = ['Neighbourhoods'])
geocoder = OpenCageGeocode("ac45248e086f492295223df2df9c4a31") # get api key from: https://opencagedata.com
query = 'Amsterdam, NL'
results = geocoder.geocode(query)
lat = results[0]['geometry']['lat']
lng = results[0]['geometry']['lng']
list_lat = [] # create empty lists
list_long = []
for Neighbourhood in df_Neigh_extented['Neighbourhoods']: # iterate over rows in dataframe
results = geocoder.geocode('{} amsterdam'.format(Neighbourhood))
lat = results[0]['geometry']['lat']
long = results[0]['geometry']['lng']
list_lat.append(lat)
list_long.append(long)
# create new columns from lists
df_Neigh_extented['lat'] = list_lat
df_Neigh_extented['lon'] = list_long
address = 'Amsterdam'
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
amenities = joblib.load('amenities_tags_2.pkl').reset_index().drop(columns = "index")
amenities['lat'] = amenities.geometry.y
amenities['lon'] = amenities.geometry.x
amenities.loc[amenities['tagkey'] == "tram", 'tagvalue'] = "tram"
amenities.loc[amenities['tagkey'] == "bus", 'tagvalue'] = "bus"
amenities.loc[amenities['tagkey'] == "subway", 'tagvalue'] = "subway"
fname = "https://maps.amsterdam.nl/open_geodata/geojson_lnglat.php?KAARTLAAG=GEBIEDEN22&THEMA=gebiedsindeling"
df_borough = gpd.read_file(fname,low_memory=False)
df_borough = df_borough.drop(columns = ["Gebied_code", "Stadsdeel_code","Opp_m2"])
df_borough = df_borough.rename(columns={"Gebied": "borough_name"})
df_borough['lat'] = df_borough.centroid.y
df_borough['lon'] = df_borough.centroid.x
df_borough = df_borough[df_borough["borough_name"] != "Westpoort"]
# Create dataset with lat and long for different boroughs¶
d = {'borough': ["Centrum", "Noord", "Nieuw-West", "Oost", "West", "Zuid", "Zuidoost"],
'population': [86422, 94766, 151677, 135767, 143842, 144432,87854],
'latitude': [52.369985, 52.391111, 52.363742, 52.352778, 52.383025, 52.346389, 52.310556],
'longitude': [4.898014, 4.918306, 4.806862, 4.930556, 4.852867, 4.858611, 4.973333]}
df_new = pd.DataFrame(data=d)
# speedups.enable()
## Create dataset with lat and long for different boroughs
df_new['point'] = df_new.apply(lambda x: Point(x['longitude'], x['latitude']), axis=1)
df_areas = gpd.GeoDataFrame(df_new, geometry=gpd.points_from_xy(df_new.longitude, df_new.latitude))
df_borough['area'] = np.zeros(len(df_borough))
df_borough.loc[df_borough['borough_name'] == "Bijlmer-Oost", 'area'] = "Zuidoost"
df_borough.loc[df_borough['borough_name'] == "Geuzenveld, Slotermeer, Sloterdijken", 'area'] = "Nieuw-West"
df_borough.loc[df_borough['borough_name'] == "Osdorp", 'area'] = "Nieuw-West"
df_borough.loc[df_borough['borough_name'] == "De Aker, Sloten, Nieuw-Sloten", 'area'] = "Nieuw-West"
df_borough.loc[df_borough['borough_name'] == "Slotervaart", 'area'] = "Nieuw-West"
df_borough.loc[df_borough['borough_name'] == "Bos en Lommer", 'area'] = "West"
df_borough.loc[df_borough['borough_name'] == "Noord-West", 'area'] = "Noord"
df_borough.loc[df_borough['borough_name'] == "Westerpark", 'area'] = "West"
df_borough.loc[df_borough['borough_name'] == "Oud West, De Baarsjes", 'area'] = "West"
df_borough.loc[df_borough['borough_name'] == "Oud-Zuid", 'area'] = "Zuid"
df_borough.loc[df_borough['borough_name'] == "Buitenveldert, Zuidas", 'area'] = "Zuid"
df_borough.loc[df_borough['borough_name'] == "Centrum-West", 'area'] = "Centrum"
df_borough.loc[df_borough['borough_name'] == "Centrum-Oost", 'area'] = "Centrum"
df_borough.loc[df_borough['borough_name'] == "Oud-Noord", 'area'] = "Noord"
df_borough.loc[df_borough['borough_name'] == "Noord-Oost", 'area'] = "Noord"
df_borough.loc[df_borough['borough_name'] == "IJburg, Zeeburgereiland", 'area'] = "Oost"
df_borough.loc[df_borough['borough_name'] == "Indische Buurt, Oostelijk Havengebied", 'area'] = "Oost"
df_borough.loc[df_borough['borough_name'] == "Oud-Oost", 'area'] = "Oost"
df_borough.loc[df_borough['borough_name'] == "De Pijp, Rivierenbuurt", 'area'] = "Zuid"
df_borough.loc[df_borough['borough_name'] == "Watergraafsmeer", 'area'] = "Oost"
df_borough.loc[df_borough['borough_name'] == "Bijlmer-Centrum, Amstel III", 'area'] = "Zuidoost"
df_borough.loc[df_borough['borough_name'] == "Gaasperdam, Driemond", 'area'] = "Zuidoost"
#population data from https://onderzoek.amsterdam.nl/interactief/dashboard-kerncijfers
df_borough["population"] = np.zeros(len(df_borough))
df_borough.loc[df_borough['borough_name'] == "Bijlmer-Oost", 'population'] = 29877
df_borough.loc[df_borough['borough_name'] == "Geuzenveld, Slotermeer, Sloterdijken", 'population'] = 46697
df_borough.loc[df_borough['borough_name'] == "Osdorp", 'population'] = 40298
df_borough.loc[df_borough['borough_name'] == "De Aker, Sloten, Nieuw-Sloten", 'population'] = 28982
df_borough.loc[df_borough['borough_name'] == "Slotervaart", 'population'] = 45411
df_borough.loc[df_borough['borough_name'] == "Bos en Lommer", 'population'] = 36618
df_borough.loc[df_borough['borough_name'] == "Noord-West", 'population'] = 39096
df_borough.loc[df_borough['borough_name'] == "Westerpark", 'population'] = 38805
df_borough.loc[df_borough['borough_name'] == "Oud West, De Baarsjes", 'population'] = 73485
df_borough.loc[df_borough['borough_name'] == "Oud-Zuid", 'population'] = 54400
df_borough.loc[df_borough['borough_name'] == "Buitenveldert, Zuidas", 'population'] = 27472
df_borough.loc[df_borough['borough_name'] == "Centrum-West", 'population'] = 44009
df_borough.loc[df_borough['borough_name'] == "Centrum-Oost", 'population'] = 43970
df_borough.loc[df_borough['borough_name'] == "Oud-Noord", 'population'] = 31878
df_borough.loc[df_borough['borough_name'] == "Noord-Oost", 'population'] = 3219029474
df_borough.loc[df_borough['borough_name'] == "Indische Buurt, Oostelijk Havengebied", 'population'] = 41750
df_borough.loc[df_borough['borough_name'] == "Oud-Oost", 'population'] = 35566
df_borough.loc[df_borough['borough_name'] == "De Pijp, Rivierenbuurt", 'population'] = 63329
df_borough.loc[df_borough['borough_name'] == "Watergraafsmeer", 'population'] = 36291
df_borough.loc[df_borough['borough_name'] == "Bijlmer-Centrum, Amstel III", 'population'] = 27018
df_borough.loc[df_borough['borough_name'] == "Gaasperdam, Driemond", 'population'] = 33728
def get_area_name(pnt:Point):
for i,j in enumerate(df_borough.geometry):
if pnt.within(j):
return df_borough["borough_name"].iloc[i]
get_area_name = np.vectorize(get_area_name)
amenities['borough_name'] = pd.Series(get_area_name(amenities.geometry.values))
amenities = amenities.rename(columns={"lat": "lat_place", "lon": "lon_place", "geometry": "geometry_place"})
amenities_all = amenities.merge(df_borough, how='inner', on='borough_name')
amenities_all = amenities_all.rename(columns={"lat": "lat_borough", "lon": "lon_borough", "geometry": "geometry_borough"})
# one hot encoding of the Venue Category
amsterdam_onehot = pd.get_dummies(amenities_all[['tagvalue']], prefix="", prefix_sep="")
# add borough column back to dataframe
amsterdam_onehot['borough_name'] = amenities_all['borough_name']
# move borough column to the first column
fixed_columns = [amsterdam_onehot.columns[-1]] + list(amsterdam_onehot.columns[:-1])
amsterdam_onehot = amsterdam_onehot[fixed_columns]
# Optional - save the encoded df
amsterdam_onehot.to_csv('amsterdam_onehot.csv', index=False)
amsterdam_grouped = amsterdam_onehot.groupby('borough_name').mean().reset_index()
def return_most_common_venues(row, num_top_venues):
row_categories = row.iloc[1:]
row_categories_sorted = row_categories.sort_values(ascending=False)
return row_categories_sorted.index.values[0:num_top_venues]
num_top_venues = 40
indicators = ['st', 'nd', 'rd']
# create columns according to number of top venues
columns = ['borough_name']
for ind in np.arange(num_top_venues):
try:
columns.append('{}{} Most Common Place'.format(ind+1, indicators[ind]))
except:
columns.append('{}th Most Common Place'.format(ind+1))
# create a new dataframe
boroughs_venues_sorted = pd.DataFrame(columns=columns)
boroughs_venues_sorted['borough_name'] = amsterdam_grouped['borough_name']
for ind in np.arange(amsterdam_grouped.shape[0]):
boroughs_venues_sorted.iloc[ind, 1:] = return_most_common_venues(amsterdam_grouped.iloc[ind, :], num_top_venues)
# preparing the data for clustering - dropping the Borough column as it is not required
amsterdam_grouped_clustering = amsterdam_grouped.drop('borough_name', 1)
# set the number of clusters
kclusters = 5
# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, n_init=50, max_iter=600, tol=0.0001, random_state=0)
kmeans.fit(amsterdam_grouped_clustering)
# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:20]
df_new = df_new.rename(columns={"borough": "area"})
# add clustering labels
boroughs_venues_sorted.insert(0, 'cluster_labels', kmeans.labels_)
amsterdam_merged = df_borough
# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
amsterdam_merged = amsterdam_merged.join(boroughs_venues_sorted.set_index('borough_name'), on='borough_name')
#dropna because we have data for neighborhoods for which we have pedestrian data
amsterdam_merged_new= amsterdam_merged.dropna()
amsterdam_merged_new["cluster_labels"] = amsterdam_merged_new["cluster_labels"].astype(str)
colordict_2 = {"0.0": 'blue', "1.0": 'green', "2.0": 'orange', "3.0":"yellow", "4.0":"red"}
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11, tiles ="cartodb positron")
for _, r in df_borough.iterrows():
# Without simplifying the representation of each borough,
# the map might not be displayed
sim_geo = gpd.GeoSeries(r['geometry']).simplify(tolerance=0.001)
geo_j = sim_geo.to_json()
geo_j = folium.GeoJson(data=df_borough,
style_function=lambda x: {'color':'gray','fillColor': 'lightgreen',"opacity":1,
'fillOpacity': 0,
'interactive':False})
#folium.Popup(r['borough_name']).add_to(geo_j)
geo_j.add_to(map_clusters)
# add markers to the map
markers_colors = []
for lat, lon, borough, cluster in zip(amsterdam_merged_new['lat'], amsterdam_merged_new['lon'], amsterdam_merged_new['borough_name'], amsterdam_merged_new['cluster_labels']):
folium.CircleMarker(
[lat, lon],
radius=10,
popup=('Borough: ' + str(borough).capitalize() + '<br>'
'Cluster #: ' + str(cluster) + '<br>'),
color='b',
key_on = cluster,
threshold_scale=[0,1,2],
fill_color=colordict_2[cluster],
fill=True,
fill_opacity=0.9).add_to(map_clusters)
num_top_venues = 40
cluster_0 = [] # Blue
cluster_3 = [] # Yellow (City center)
cluster_4 = [] # Red
for place, cluster in zip(amsterdam_merged_new['borough_name'], amsterdam_merged_new['cluster_labels']):
#print("----"+place+"----")
temp = amsterdam_grouped[amsterdam_grouped['borough_name'] == place].T.reset_index()
temp.columns = ['place','freq']
temp = temp.iloc[1:]
# print(temp)
temp['freq'] = temp['freq'].astype(float)
temp = temp.round({'freq': 4})
#print(cluster)
if cluster == '0.0':
cluster_0.append(temp)
if cluster == '3.0':
cluster_3.append(temp)
if cluster == '4.0':
cluster_4.append(temp)
#print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
#print('\n')
from functools import reduce
cluster_3 = reduce(lambda left,right: pd.merge(left,right,on='place'), cluster_3)
cluster_0 = reduce(lambda left,right: pd.merge(left,right,on='place'), cluster_0)
cluster_4 = reduce(lambda left,right: pd.merge(left,right,on='place'), cluster_4)
c_types = pd.DataFrame({
'blue': cluster_0.set_index('place').sum(axis=1),
'yellow':cluster_3.set_index('place').sum(axis=1),
'red':cluster_4.set_index('place').sum(axis=1),
'diff':cluster_3.set_index('place').sum(axis=1)- cluster_0.set_index('place').sum(axis=1)}
).sort_values('diff').head(5)
c_types.index.name=None
c_types.loc[c_types.index=='clothes'].index
#amenities.loc[amenities['tagkey'] == "tram", 'tagvalue'] = "tram"
as_list = c_types.index.tolist()
idx = as_list.index('clothes')
as_list[idx] = 'clothing store'
c_types.index = as_list
c_types[['blue','yellow','red']]
return map_clusters, c_types
# --- Model analysis
def cramers_v(x, y):
    """Bias-corrected Cramer's V association between two categorical series.

    Applies the Bergsma-Wicher correction to the phi-squared statistic
    derived from the chi-squared test on the contingency table of ``x``
    and ``y``. The result lies in [0, 1].
    """
    table = pd.crosstab(x, y)
    n_obs = table.sum().sum()
    phi_sq = ss.chi2_contingency(table)[0] / n_obs
    n_rows, n_cols = table.shape
    # Bias correction: clamp at zero so sampling noise cannot go negative.
    phi_sq_corr = max(0, phi_sq - (n_cols - 1) * (n_rows - 1) / (n_obs - 1))
    rows_corr = n_rows - (n_rows - 1) ** 2 / (n_obs - 1)
    cols_corr = n_cols - (n_cols - 1) ** 2 / (n_obs - 1)
    return np.sqrt(phi_sq_corr / min(cols_corr - 1, rows_corr - 1))
def correlation_ratio(categories, measurements):
    """Correlation ratio (eta squared) between a categorical and a numeric variable.

    0 means the category carries no information about the measurement;
    1 means the measurement is fully determined by the category.
    """
    codes, _ = pd.factorize(categories)
    n_groups = np.max(codes) + 1
    group_means = np.zeros(n_groups)
    group_sizes = np.zeros(n_groups)
    for code in range(n_groups):
        group = measurements[np.argwhere(codes == code).flatten()]
        group_sizes[code] = len(group)
        group_means[code] = np.average(group)
    grand_mean = np.sum(group_means * group_sizes) / np.sum(group_sizes)
    ss_between = np.sum(group_sizes * (group_means - grand_mean) ** 2)
    ss_total = np.sum((measurements - grand_mean) ** 2)
    # Guard the all-equal case (would be 0/0), matching the original behavior.
    if ss_between == 0:
        return 0.0
    return ss_between / ss_total
def associations(dataset, nominal_columns=None, mark_columns=False, theil_u=False, plot=True,
                 return_results = False, **kwargs):
    """
    Calculate the correlation/strength-of-association of features in data-set with both categorical (eda_tools) and
    continuous features using:
     - Pearson's R for continuous-continuous cases
     - Correlation Ratio for categorical-continuous cases
     - Cramer's V or Theil's U for categorical-categorical cases
    :param dataset: NumPy ndarray / Pandas DataFrame
        The data-set for which the features' correlation is computed
    :param nominal_columns: string / list / NumPy ndarray
        Names of columns of the data-set which hold categorical values. Can also be the string 'all' to state that all
        columns are categorical, or None (default) to state none are categorical
    :param mark_columns: Boolean (default: False)
        if True, output's columns' names will have a suffix of '(nom)' or '(con)' based on their type (eda_tools or
        continuous), as provided by nominal_columns
    :param theil_u: Boolean (default: False)
        NOTE(review): accepted for API compatibility but never read below --
        categorical-categorical pairs always use Cramer's V.
    :param plot: Boolean (default: True)
        If True, plot a heat-map of the correlation matrix
    :param return_results: Boolean (default: False)
        If True, the function will return a Pandas DataFrame of the computed associations
    :param kwargs:
        Arguments to be passed to used function and methods
    :return: Pandas DataFrame
        A DataFrame of the correlation/strength-of-association between all features
    """
    columns = dataset.columns
    if nominal_columns is None:
        nominal_columns = list()
    elif nominal_columns == 'all':
        nominal_columns = columns
    # Symmetric association matrix, filled pairwise below.
    # NOTE(review): chained indexing (corr[a][b] = ...) mutates in place here
    # because the frame is freshly constructed; newer pandas may warn.
    corr = pd.DataFrame(index=columns, columns=columns)
    for i in range(0,len(columns)):
        for j in range(i,len(columns)):
            if i == j:
                # A feature is perfectly associated with itself.
                corr[columns[i]][columns[j]] = 1.0
            else:
                if columns[i] in nominal_columns:
                    if columns[j] in nominal_columns:
                        # categorical vs categorical
                        cell = cramers_v(dataset[columns[i]],dataset[columns[j]])
                        corr[columns[i]][columns[j]] = cell
                        corr[columns[j]][columns[i]] = cell
                    else:
                        # categorical (i) vs continuous (j)
                        cell = correlation_ratio(dataset[columns[i]], dataset[columns[j]])
                        corr[columns[i]][columns[j]] = cell
                        corr[columns[j]][columns[i]] = cell
                else:
                    if columns[j] in nominal_columns:
                        # continuous (i) vs categorical (j)
                        cell = correlation_ratio(dataset[columns[j]], dataset[columns[i]])
                        corr[columns[i]][columns[j]] = cell
                        corr[columns[j]][columns[i]] = cell
                    else:
                        # continuous vs continuous
                        cell, _ = ss.pearsonr(dataset[columns[i]], dataset[columns[j]])
                        corr[columns[i]][columns[j]] = cell
                        corr[columns[j]][columns[i]] = cell
    corr.fillna(value=np.nan, inplace=True)
    if mark_columns:
        # Tag each label with its declared kind for easier reading of the heat-map.
        marked_columns = ['{} (nom)'.format(col) if col in nominal_columns else '{} (con)'.format(col) for col in columns]
        corr.columns = marked_columns
        corr.index = marked_columns
    if plot:
        plt.figure(figsize=(30,30))
        sns.heatmap(corr, annot=kwargs.get('annot',True), fmt=kwargs.get('fmt','.3f'), cmap='coolwarm')
        plt.show()
    if return_results:
        return corr
# --- Data prep
def prep_weather(historic_data, weather_data):
    """
    Merge and add time features to weather data.

    Both inputs must carry a ``dt`` timestamp column. Rows are de-duplicated,
    restricted to the observation window, inner-joined on ``dt``, and
    ``year``/``month``/``hour``/``weekday_name``/``day`` calendar columns are
    derived. The ``dt`` and ``id`` columns are dropped from the result.
    """
    window_start = "2021-03-22 22:00:00+00:00"
    window_end = "2021-09-30 11:00:00+00:00"

    def _dedup_and_clip(frame):
        # Drop exact duplicate rows, renumber, and keep only the study window.
        clean = frame.drop_duplicates(keep = "first").reset_index().drop(columns = "index")
        return clean[(clean.dt >= window_start) & (clean.dt <= window_end)]

    merged = pd.merge(_dedup_and_clip(historic_data),
                      _dedup_and_clip(weather_data),
                      on=["dt"], how='inner')
    merged["dt"] = pd.to_datetime(merged["dt"])
    merged = merged.set_index('dt')
    merged['year'] = merged.index.year
    merged['month'] = merged.index.month
    merged['hour'] = merged.index.hour
    merged['weekday_name'] = merged.index.day_name()
    merged['day'] = merged.index.day
    return merged.reset_index().drop(columns = ["dt", "id"])
def prep_pedestrians(df_weather):
    """
    Extract crowdmonitor data
    https://api.data.amsterdam.nl/v1/wfs/crowdmonitor/?SERVICE=WFS&VERSION=2.0.0&REQUEST=GetFeature&TYPENAMES=passanten&OUTPUTFORMAT=geojson
    and combine with pedestrian data and weather

    Returns a tuple ``(datawe, df_cr_2)``: hourly pedestrian counts merged
    with weather features, and the filtered sensor GeoDataFrame.
    """
    # Hourly crowd-monitor sensor readings from the Amsterdam WFS feed.
    fname = "https://api.data.amsterdam.nl/v1/wfs/crowdmonitor/?SERVICE=WFS&VERSION=2.0.0&REQUEST=GetFeature&TYPENAMES=passanten&OUTPUTFORMAT=geojson"
    df_crowd = gpd.read_file(fname,low_memory=False)
    # Keep only the hourly ("uur") records.
    df_crowd_h = df_crowd[(df_crowd.periode == "uur")].reset_index().drop(columns = "index").dropna()
    # Restrict to the areas of interest ...
    df_cr = df_crowd_h.loc[df_crowd_h['gebied'].isin(["Winkelgebied centrum", "Wallen", "Albert Cuypstraat", "Vondelpark", "IJ-veren"])]
    # ... and exclude individual sensor locations not used in the analysis.
    df_cr_2 = df_cr.loc[~df_cr['naam_locatie'].isin(["Museumplein Noord", "Museumplein Zuid", "CS - ri. Buiksloterweg (Oost)", "CS - ri. IJPlein", "Pontsteiger Z (ri. Distelweg)" , "Bloedstraat","Nieuwebrugsteeg", "Oudezijds Voorburgwal t.h.v. 107", "Oudezijds Achterburgwal t.h.v. 116", "Stormsteeg t.h.v. 3", "Kalverstraat t.h.v. 1", "Rokin 66", "Dam-zone"])]
    # WGS84 coordinates for mapping and joins.
    # NOTE(review): these assignments write into a filtered slice and may raise
    # pandas' SettingWithCopyWarning; warnings are silenced globally up top.
    df_cr_2['geometry_2'] = df_cr_2['geometry'].to_crs(4326)
    df_cr_2['lat'] = df_cr_2.geometry_2.y
    df_cr_2['lon'] = df_cr_2.geometry_2.x
    # Borough polygons; map each borough to its parent city district ("area").
    fname = "https://maps.amsterdam.nl/open_geodata/geojson_lnglat.php?KAARTLAAG=GEBIEDEN22&THEMA=gebiedsindeling"
    df_borough = gpd.read_file(fname,low_memory=False)
    df_borough = df_borough.drop(columns = ["Gebied_code", "Stadsdeel_code","Opp_m2"]).rename(columns={"Gebied": "borough_name"})
    df_borough = df_borough[df_borough["borough_name"] != "Westpoort"]
    # Initialised numeric, then overwritten with district-name strings below.
    df_borough['area'] = np.zeros(len(df_borough))
    df_borough.loc[df_borough['borough_name'] == "Bijlmer-Oost", 'area'] = "Zuidoost"
    df_borough.loc[df_borough['borough_name'] == "Geuzenveld, Slotermeer, Sloterdijken", 'area'] = "Nieuw-West"
    df_borough.loc[df_borough['borough_name'] == "Osdorp", 'area'] = "Nieuw-West"
    df_borough.loc[df_borough['borough_name'] == "De Aker, Sloten, Nieuw-Sloten", 'area'] = "Nieuw-West"
    df_borough.loc[df_borough['borough_name'] == "Slotervaart", 'area'] = "Nieuw-West"
    df_borough.loc[df_borough['borough_name'] == "Bos en Lommer", 'area'] = "West"
    df_borough.loc[df_borough['borough_name'] == "Noord-West", 'area'] = "Noord"
    df_borough.loc[df_borough['borough_name'] == "Westerpark", 'area'] = "West"
    df_borough.loc[df_borough['borough_name'] == "Oud West, De Baarsjes", 'area'] = "West"
    df_borough.loc[df_borough['borough_name'] == "Oud-Zuid", 'area'] = "Zuid"
    df_borough.loc[df_borough['borough_name'] == "Buitenveldert, Zuidas", 'area'] = "Zuid"
    df_borough.loc[df_borough['borough_name'] == "Centrum-West", 'area'] = "Centrum"
    df_borough.loc[df_borough['borough_name'] == "Centrum-Oost", 'area'] = "Centrum"
    df_borough.loc[df_borough['borough_name'] == "Oud-Noord", 'area'] = "Noord"
    df_borough.loc[df_borough['borough_name'] == "Noord-Oost", 'area'] = "Noord"
    df_borough.loc[df_borough['borough_name'] == "IJburg, Zeeburgereiland", 'area'] = "Oost"
    df_borough.loc[df_borough['borough_name'] == "Indische Buurt, Oostelijk Havengebied", 'area'] = "Oost"
    df_borough.loc[df_borough['borough_name'] == "Oud-Oost", 'area'] = "Oost"
    df_borough.loc[df_borough['borough_name'] == "De Pijp, Rivierenbuurt", 'area'] = "Zuid"
    df_borough.loc[df_borough['borough_name'] == "Watergraafsmeer", 'area'] = "Oost"
    df_borough.loc[df_borough['borough_name'] == "Bijlmer-Centrum, Amstel III", 'area'] = "Zuidoost"
    df_borough.loc[df_borough['borough_name'] == "Gaasperdam, Driemond", 'area'] = "Zuidoost"
    # Spatially assign each sensor point to the borough polygon containing it
    # (returns None when no polygon contains the point).
    def get_area_name(pnt:Point):
        for i,j in enumerate(df_borough.geometry):
            if pnt.within(j):
                return df_borough["borough_name"].iloc[i]
    get_area_name = np.vectorize(get_area_name)
    df_cr_2['borough_name'] = pd.Series(get_area_name(df_cr_2["geometry_2"].values))
    df_borough = df_borough.drop(columns = ["geometry"])
    df_cr_3 = df_cr_2.drop(columns = ["id", "sensor", "periode", "geometry", "geometry_2"])
    # Clip to the study window.
    # NOTE(review): the upper bound is written "+0000" while the lower uses
    # "+00:00"; with plain string comparison this may exclude timestamps
    # formatted "...+00:00" at the boundary hour -- confirm the feed's format.
    df_crowd_year = df_cr_3[(df_cr_3.datum_uur >= "2021-03-23 15:00:00+00:00") & (df_cr_3.datum_uur <= "2021-09-30 11:00:00+0000")]
    df_crowd_year["datum_uur"] = pd.to_datetime(df_crowd_year["datum_uur"])
    df_yearly = df_crowd_year.copy()
    df_yearly["datum_uur"] = pd.to_datetime(df_yearly["datum_uur"])
    df_yearly = df_yearly.set_index('datum_uur')
    # Calendar features; also used below as merge keys against the weather frame.
    df_yearly['year'] = df_yearly.index.year
    df_yearly['month'] = df_yearly.index.month
    df_yearly['hour'] = df_yearly.index.hour
    df_yearly['weekday_name'] = df_yearly.index.day_name()
    df_yearly['day'] = df_yearly.index.day
    df_yearly = df_yearly.reset_index().drop(columns = [ "datum_uur"])
    # One row per sensor/calendar-hour, then join the weather features.
    df_yearly_new = df_yearly.groupby(["naam_locatie", "gebied", "lat", "lon", "year", "month", "hour", "weekday_name", "day"]).sum().reset_index()
    datawe = df_yearly_new.merge(df_weather, how='inner', on=["year", "month", "hour", "weekday_name", "day"]).drop(columns= [ "year", "snow_last_hour"])
    return (datawe, df_cr_2)
def yet_more_formatting(datawe_new, df_cr_2):
    """
    Okay data is hard, we just love transforming it :)

    Joins pre-computed OSM amenity features onto the pedestrian/weather frame,
    restricts to the city-centre sensor locations, one-hot encodes the sensor
    names and splits off the target.

    :param datawe_new: pedestrian counts merged with weather features
    :param df_cr_2: sensor GeoDataFrame (provides the sensor/area lookup)
    :return: tuple ``(X_centre, y_centre)`` -- model matrix and the
        ``aantal_passanten`` (pedestrian count) target
    """
    # Pre-computed OSM feature table (counts within 100/200/300 m and
    # min-distances per sensor location).
    amenities = joblib.load('./data/osm_features.pkl')
    # Keep only the 100 m count features: drop all 200 m/300 m counts and all
    # min-distance columns.
    # NOTE(review): a few labels appear twice in this list (e.g.
    # "subway_yes_min_distance"); pandas drops them once without error.
    amenities_new = amenities.drop(columns = [ "leisure_playground_count_300m", "leisure_playground_count_200m", "leisure_playground_min_distance", "leisure_sports_centre_count_200m",
        "leisure_sports_centre_count_300m", "leisure_sports_centre_min_distance", "leisure_picnic_table_min_distance", "bus_yes_min_distance",
        "subway_yes_min_distance", "tram_yes_min_distance", "amenity_restaurant_min_distance", "amenity_bench_min_distance", "amenity_cafe_min_distance",
        "amenity_fast_food_min_distance", "amenity_bicycle_parking_min_distance", "amenity_pub_min_distance", "amenity_drinking_water_min_distance",
        "amenity_waste_basket_min_distance", "amenity_charging_station_min_distance", "amenity_post_box_min_distance", "amenity_bar_min_distance",
        "amenity_brothel_min_distance", "amenity_atm_min_distance", "amenity_parking_min_distance", "amenity_toilets_min_distance","amenity_school_min_distance",
        "amenity_community_centre_min_distance", "tourism_artwork_min_distance", "tourism_hotel_min_distance", "tourism_information_min_distance",
        "tourism_museum_min_distance","tourism_attraction_min_distance", "tourism_picnic_site_min_distance", "tourism_hostel_min_distance",
        "tourism_viewpoint_min_distance", "tourism_guest_house_min_distance", "tourism_gallery_min_distance", "shop_clothes_min_distance", "shop_convenience_min_distance",
        "shop_hairdresser_min_distance", "shop_supermarket_min_distance","shop_gift_min_distance","shop_bicycle_min_distance","shop_bakery_min_distance",
        "shop_bicycle_count_200m", "shop_bicycle_count_300m", "shop_bakery_count_200m", "shop_bakery_count_300m", "shop_convenience_count_200m", "subway_yes_min_distance",
        "shop_convenience_count_300m", "leisure_picnic_table_count_200m", "leisure_picnic_table_count_300m", "bus_yes_count_200m","bus_yes_count_300m", "shop_supermarket_count_200m",
        "shop_supermarket_count_300m", "shop_gift_count_200m", "shop_gift_count_300m", "subway_yes_count_200m", "subway_yes_count_300m", "shop_hairdresser_count_200m", "shop_hairdresser_count_300m",
        "tram_yes_count_200m", "tram_yes_count_300m", "amenity_restaurant_count_200m", "amenity_restaurant_count_300m", "amenity_bench_count_200m",
        "amenity_bench_count_300m", "amenity_cafe_count_200m", "amenity_cafe_count_300m", "amenity_fast_food_count_200m", "amenity_fast_food_count_300m" ,
        "amenity_bicycle_parking_count_200m", "amenity_bicycle_parking_count_300m", "amenity_pub_count_200m", "amenity_pub_count_300m",
        "amenity_drinking_water_count_200m", "amenity_drinking_water_count_300m", "amenity_waste_basket_count_200m", "amenity_waste_basket_count_300m",
        "amenity_charging_station_count_200m", "amenity_charging_station_count_300m", "amenity_post_box_count_200m", "amenity_post_box_count_300m",
        "amenity_bar_count_200m", "amenity_bar_count_300m", "amenity_brothel_count_200m", "amenity_brothel_count_300m", "amenity_atm_count_200m",
        "amenity_atm_count_300m", "amenity_parking_count_200m", "amenity_parking_count_300m", "amenity_toilets_count_200m", "amenity_toilets_count_300m",
        "amenity_school_count_200m", "amenity_school_count_300m", "amenity_community_centre_count_200m", "amenity_community_centre_count_300m",
        "amenity_community_centre_count_200m", "amenity_community_centre_count_300m", "tourism_artwork_count_200m", "tourism_artwork_count_300m",
        "tourism_hotel_count_200m", "tourism_hotel_count_300m", "tourism_information_count_200m", "tourism_information_count_300m",
        "tourism_museum_count_200m", "tourism_museum_count_300m", "tourism_attraction_count_200m", "tourism_attraction_count_300m",
        "tourism_picnic_site_count_200m", "tourism_picnic_site_count_300m", "tourism_hostel_count_200m", "tourism_hostel_count_300m",
        "tourism_viewpoint_count_200m", "tourism_viewpoint_count_300m", "tourism_guest_house_count_200m", "tourism_guest_house_count_300m",
        "tourism_gallery_count_200m", "tourism_gallery_count_300m", "shop_clothes_count_200m", "shop_clothes_count_300m"])
    # Coordinates used as join keys against the pedestrian/weather frame.
    amenities_new['lat'] = amenities.geometry.y
    amenities_new['lon'] = amenities.geometry.x
    data = datawe_new.merge(amenities_new, how='inner', on=['naam_locatie', "lat", "lon"])
    data = data.drop(columns = ["geometry"])
    # One row per unique sensor (name + area + coordinates).
    df_sensors = df_cr_2[["naam_locatie", "gebied", "lat", "lon"]]
    df_sensors = df_sensors.drop_duplicates(keep='last')
    data_nuovo = data.merge(df_sensors, how='inner', on=["naam_locatie", "lat", "lon"])
    # Restrict to the city-centre sensors.
    centre = data_nuovo.loc[data_nuovo['naam_locatie'].isin(['Dam-total', 'Kalverstraat Zuid t.h.v.73', 'Nieuwendijk t.h.v.218', 'Oudebrugsteeg', 'Damrak 1-5', 'Korte Niezel', 'Molensteeg',
        'Oudezijds Achterburgwal t.h.v. 86', 'Oudezijds Achterburgwal t.h.v. 91', 'Oudezijds Voorburgwal t.h.v. 206', 'Stoofsteeg',
        'Damstraat', 'Kloveniersburgwal', 'Oudekennissteeg', 'Oudezijds Voorburgwal t.h.v. 91'])].reset_index().drop(columns = ["index", "gebied"])
    # One-hot encode remaining categoricals; drop one dummy to avoid perfect
    # collinearity among the location indicators.
    centre_2 = pd.get_dummies(centre)
    centre_2 = centre_2.drop(columns = ["naam_locatie_Stoofsteeg"])
    # NOTE(review): this hand-picked subset is computed but not used below --
    # the full centre_2 frame is returned.
    centre_sel_2 = centre_2[["hour", "temp", "aantal_passanten", "humidity", "wind_direction", "wind_speed", "pressure", "tram_yes_count_100m", "bus_yes_count_100m",
        "amenity_restaurant_count_100m", "amenity_bench_count_100m", "amenity_cafe_count_100m", "amenity_fast_food_count_100m",
        "amenity_pub_count_100m", "amenity_waste_basket_count_100m", "amenity_charging_station_count_100m", "amenity_bar_count_100m",
        "amenity_brothel_count_100m", "amenity_atm_count_100m", "amenity_toilets_count_100m", "tourism_artwork_count_100m",
        "tourism_hotel_count_100m", "tourism_attraction_count_100m", "shop_clothes_count_100m", "shop_gift_count_100m",
        "shop_bicycle_count_100m", "shop_bakery_count_100m", "shop_convenience_count_100m",
        "naam_locatie_Damrak 1-5", "naam_locatie_Damstraat", "naam_locatie_Kalverstraat Zuid t.h.v.73"]]
    X_centre = centre_2.copy()
    # pop() removes the target column from X in place and returns it as y.
    y_centre = X_centre.pop('aantal_passanten')
    return X_centre, y_centre
# --- Prophet
def run_prophet(df, max_year, df_holidays):
    """Fit a Prophet model on rows up to ``max_year`` and forecast one year ahead.

    :param df: frame in Prophet format (``ds``/``y``) plus a ``year`` column
    :param max_year: last year of data included in the training set
    :param df_holidays: holiday frame passed straight to Prophet
    :return: forecast frame from ``Prophet.predict`` (history excluded)
    """
    training_rows = df[df.year <= max_year]
    model = Prophet(growth="linear",
                    n_changepoints=25,
                    yearly_seasonality=True,
                    weekly_seasonality=True,
                    daily_seasonality=True,
                    holidays=df_holidays)
    model.fit(training_rows)
    future = model.make_future_dataframe(periods=365, include_history=False)
    return model.predict(future)
def prophet_prep(pollution_data):
    """Shape the PM2.5 series into Prophet's ``ds``/``y`` format plus a holiday frame.

    :param pollution_data: frame with ``timestamp_measured`` and ``PM25`` columns
    :return: tuple ``(df_pm25_fb, df_holiday_2016)`` -- the series with naive
        (timezone-free) timestamps, and Dutch public holidays for 2016-2017
    """
    series = pollution_data[["timestamp_measured", "PM25"]].copy()
    series["year"] = series["timestamp_measured"].apply(lambda ts: ts.year)
    series = series.rename(columns = {"timestamp_measured": "ds", "PM25": "y"})
    # Prophet rejects timezone-aware timestamps, so strip the timezone.
    series["ds"] = series.ds.apply(lambda ts: ts.tz_localize(None))
    nl_holidays = holidays.NL(years = 2016) + holidays.NL(years = 2017)
    holiday_frame = (pd.DataFrame.from_dict(data=nl_holidays, orient="index")
                     .reset_index()
                     .rename(columns={"index": "ds", 0: "holiday"}))
    return (series, holiday_frame)
# --- traffic data
def load_dataframes_traffic_numbers(data_folder):
    """Load every traffic-count Excel workbook in ``data_folder``.

    Each file is read from its "Intensiteit" sheet (second row as header)
    and stored under its own file name.

    Fixes: the directory was previously listed twice (the two listings could
    disagree if files changed in between), and paths were built with string
    concatenation; a single listing and ``os.path.join`` are used now.

    :param data_folder: directory containing the Excel exports
    :return: tuple ``(dataframes, files)`` -- dict of DataFrames keyed by
        file name, and the list of file names in the order they were read
    """
    files = os.listdir(data_folder)
    dataframes = {
        name: pd.read_excel(os.path.join(data_folder, name),
                            sheet_name="Intensiteit", header=1)
        for name in files
    }
    return (dataframes, files)
def clean_data(df):
    """
    Reshape one raw traffic-intensity Excel sheet into a tidy frame.

    Promotes the embedded header row, translates the Dutch vehicle-size
    columns to English names, attaches the sensor id and measuring period
    parsed from the "Gemiddelde ..." marker rows, tags mid-city sensors and
    derives absolute vehicle counts from the percentage columns.
    """
    # The real column header sits in the third row of the sheet.
    df.columns = df.iloc[2]
    df = df.drop([df.index[0]])
    df = df.rename(columns={
        "uur op de dag":"time_of_day",
        "onbepaald (%)":"unknown_size",
        "tussen 1,85 m en 2,40 m (%)":"small_car",
        "tussen 2,40 m en 5,60 m (%)":"medium_car",
        "tussen 5,60 m en 11,50 m (%)":"big_car",
        "tussen 11,50 m en 12,20 m (%)":"small_lorry",
        "groter dan 12,20 m (%)":"big_lorry"
        }
    )
    # Drop "Totaal" summary rows and empty separator rows.
    df = df[df.time_of_day != "Totaal"]
    df = df[df.time_of_day.notna()]
    df = df.reset_index()
    # Marker rows like "Gemiddelde ... (SENSOR) ..." carry the sensor id and
    # the averaging period; word positions 5 and 8 hold the two dates.
    # NOTE(review): sensor_string[0] assumes the first marker row sits at
    # positional label 0 after reset_index -- confirm for all exports.
    sensor_string = df.time_of_day.loc[df["time_of_day"].str.contains("Gemiddelde")]
    start_date = sensor_string[0].split(" ")[5]
    end_date = sensor_string[0].split(" ")[8]
    # Pull the sensor id from inside the parentheses.
    sensor_string = sensor_string.str.extract('.*\((.*)\).*')
    sensor_string.columns = ["sensor"]
    # Each marker row is followed by the hourly rows for that sensor; label
    # the next 25 positions with its id.
    for idx, sensor in sensor_string.iterrows():
        idx_end = idx+25
        df.loc[idx:idx_end, "sensor"] = sensor.values[0]
    df["start_date"] = start_date
    df["end_date"] = end_date
    # Keep only real measurement rows (drop markers / repeated header rows).
    df = df[df.Intensiteit.notna()]
    df = df[df.Intensiteit != "Intensiteit"]
    midcity_sensors = ['GAD02_Amstd_29_0', 'GAD02_Amstd_30_0', 'GAD02_Amstd_33_0',
                       'GAD02_Amstd_33_2', 'GAD02_Amstd_34_0', 'GAD02_Amstd_34_2']
    df.loc[df['sensor'].isin(midcity_sensors), "sensor_type"] = "midcity"
    columns=("small_car", "medium_car","big_car","small_lorry","big_lorry","unknown_size")
    def round_up_car_numbers(df, columns):
        # Convert each percentage column into an absolute (ceiled) count.
        for column in columns:
            column_name = column+"_numbers"
            df[column_name] = np.ceil((df[column]/100)*df.Intensiteit)
        return(df)
    df = round_up_car_numbers(df, columns)
    return(df)
def unzip_folder(zip_dir, output_dir):
    """Extract every zip archive found in ``zip_dir`` into ``output_dir``.

    Bug fix: archives were previously opened from ``output_dir`` even though
    the file names were listed from ``zip_dir``, so the function only worked
    when both arguments pointed at the same directory. Archives are now
    opened from ``zip_dir`` as the parameter names imply.

    :param zip_dir: directory containing the ``.zip`` files
    :param output_dir: directory the archives are extracted into
    """
    for zip_name in os.listdir(zip_dir):
        with zipfile.ZipFile(os.path.join(zip_dir, zip_name), 'r') as zip_ref:
            zip_ref.extractall(output_dir)
def character_cleanup(df):
    """Normalize the raw CBS van-fleet export.

    Strips footnote asterisks, converts Dutch decimal commas to points and
    translates the Dutch user-category labels to English identifiers.

    Fix: the asterisk pattern is now a raw string; as a plain string it was
    an invalid escape sequence (a warning today, a SyntaxError in future
    Python versions), though it happened to yield the same regex.

    :param df: raw DataFrame as read from the CBS CSV
    :return: cleaned copy of the frame
    """
    df = df.replace(r'\*', '', regex=True)
    df = df.replace(',', '.', regex=True)
    df = df.replace("Totaal bestelauto's", "total_vans", regex=True)
    df = df.replace("Bestelauto's van particulieren", "private_vans", regex=True)
    df = df.replace("Bestelauto's van bedrijven", "company_vans", regex=True)
    return(df)
def van_age_data_processing(filename):
    """Read the CBS semicolon-separated van-age export and normalize it.

    Renames the Dutch column headers to English, applies the shared
    character clean-up and casts the numeric columns.

    :param filename: path to the CBS ``;``-separated CSV file
    :return: cleaned DataFrame with numeric year/age/count columns
    """
    column_map = {
        "Hoofdgebruiker bestelauto": "main_user",
        "Perioden": "years",
        "Regio's": "region",
        "Gemiddelde leeftijd bestelauto's (jaren)": "avg_age_vans_years",
        "Bestelauto's naar leeftijdsklasse/Alle leeftijdsklassen (aantal)": "number_of_vans_in_class",
    }
    frame = pd.read_csv(filename, sep=";").rename(columns=column_map)
    frame = character_cleanup(frame)
    numeric_cols = ["years", "avg_age_vans_years", "number_of_vans_in_class"]
    frame[numeric_cols] = frame[numeric_cols].apply(pd.to_numeric)
    return(frame)
# pollution
# NOTE(review): get_timeseries_dataframe / get_historic_weather /
# get_traffic_forecast / get_and_process_survey_data are defined in another
# part of this notebook export.
pollution_data = get_timeseries_dataframe()
# Weather
# Historic + current weather for a point in central Amsterdam.
# NOTE(review): arguments look like (lon, lat) order -- confirm against
# get_historic_weather's signature.
historic_data, weather_data = get_historic_weather(4.888062,52.359214,
                                                   '2021-04-01 12:00:00',
                                                   '2022-02-01 12:00:00')
# Traffic forecast
road_location,sensor_location,rijksmuseum_location = get_traffic_forecast()
# Our own survey data
(survey, df_demo_super,df_compare_plot) = get_and_process_survey_data()
print("User survey with",len(survey),"people")
# IPython shell escape: sanity-check the OSM history extract is on disk.
!ls ./data/amsterdam.osh.pbf
# Crowd-monitor pedestrian sensors (WFS GeoJSON feed), re-projected to WGS84.
fname = "https://api.data.amsterdam.nl/v1/wfs/crowdmonitor/?SERVICE=WFS&VERSION=2.0.0&REQUEST=GetFeature&TYPENAMES=passanten&OUTPUTFORMAT=geojson"
df_crowd = gpd.read_file(fname,low_memory=False)
df_crowd['geometry'] = df_crowd['geometry'].to_crs(4326)
# Sensor locations used throughout the analysis.
important_sensors = ['Korte Niezel','Oudekennissteeg','Stoofsteeg','Oudezijds Voorburgwal t.h.v. 91','Oudezijds Achterburgwal t.h.v. 86','Oudezijds Achterburgwal t.h.v. 91','Oudezijds Voorburgwal t.h.v. 206','Molensteeg','Oudebrugsteeg','Damstraat','Kloveniersburgwal','Bloedstraat','Nieuwebrugsteeg','Oudezijds Voorburgwal t.h.v. 107','Oudezijds Achterburgwal t.h.v. 116','Stormsteeg t.h.v. 3','Stadhouderskade','Museumplein Noord','Museumplein Zuid','Amstelveenseweg','Kalverstraat t.h.v. 1','Kalverstraat Zuid t.h.v.73','Nieuwendijk t.h.v.218','Rokin 66','Damrak 1-5','Dam-total','Dam-zone','Albert Cuypstraat']
# One geometry per sensor name, restricted to the sensors above.
pedestrian_df = df_crowd.groupby('naam_locatie').first()[['geometry']].reset_index()
pedestrian_df = pedestrian_df[pedestrian_df['naam_locatie'].isin(important_sensors)]
#return pedestrian_df.set_crs('EPSG:4326')
def load_pedestrian_data():
    """Return the module-level pedestrian sensor frame tagged as WGS84 (EPSG:4326)."""
    tagged = pedestrian_df.set_crs('EPSG:4326')
    return tagged
# geospatial + Pedestrians
# tags, pedestrian_df, stadhouderskade_df, raw_tags = calculate_features_raw()
# clusters for pedestrians and amenities
map_clusters, c_types = prepare_and_cluster()
# pedestrian modelling
#df_weather = prep_weather(historic_data, weather_data)
# NOTE(review): df_weather is consumed here while the prep_weather call above
# is commented out -- it must be built in another cell of this notebook export.
(datawe, df_cr_2)= prep_pedestrians(df_weather)
# traffic data
filename = "./data/data_traffic_van/van_average_lifetime_numbers_per_region_2009_2020.csv"
df_van =van_age_data_processing(filename)
# more traffic data
data_folder = "./data/data_traffic_numbers_2021_2022/"
dataframes, files = load_dataframes_traffic_numbers(data_folder)
# Clean the first workbook, then append the remaining ones.
df =clean_data(dataframes[files[0]])
for file in range(1, len(files)):
    df_temp=clean_data(dataframes[files[file]])
    # NOTE(review): DataFrame.append is removed in pandas >= 2.0 (use pd.concat).
    df=df.append(df_temp)
# Total lorry count = small + big lorries.
df["lorries_numbers"] = df.small_lorry_numbers+df.big_lorry_numbers
df_sensor = df.copy()
# Prophet predict
(df_pm25_fb,df_holiday_2016) = prophet_prep(pollution_data)
# Fit on data up to and including 2016, forecast the following 365 days.
df_2017 = run_prophet(df_pm25_fb,2016,df_holiday_2016)
# Plot prophet result
plt.figure()
ax=plt.gca()
#df_pm25_fb[df_pm25_fb.year==2017].plot(x="ds",y="y",label="y_true",ax=ax,alpha=.5)
# Forecast vs observed 2017 values vs the previous year for reference.
df_2017.plot(x="ds",y="yhat",style="-r",linewidth=2,label="y_fit",ax=ax)
df_pm25_fb[df_pm25_fb.year==2017].plot(x="ds",y="y",label="y_true",ax=ax,alpha=.5)
df_pm25_fb[df_pm25_fb.year==2016].plot(x="ds",y="y",label="y_last_year",ax=ax,alpha=.5)
plt.ylim(0,200)
plt.title("PM 2.5 µm fbProfet predict\n(only pick up winter seasonality)",size=20)
# join data (use weather data as limit as we have 1 year back )
wd = weather_data.copy()
hd = historic_data.copy()
# format_time is defined elsewhere in this notebook export; it is applied to
# the "dt" column in place on both frames.
format_time(wd,timecol="dt")
format_time(hd,timecol="dt")
# Left-join pollution measurements, then historic weather, onto the current
# weather timeline.
df_joined = pd.merge(wd, pollution_data,
                     left_on='dt',
                     right_on='timestamp_measured',
                     how='left'
                     )
df_joined = pd.merge(df_joined, hd,
                     left_on='dt',
                     right_on='dt',
                     how='left'
                     )
df_joined.drop(['id','timestamp_measured'], axis=1, inplace=True)
# One-hot encode the remaining categorical columns.
df_joined = pd.get_dummies(df_joined)
# focus on PM25
# detect candidate lag features and add them
lags_dict_PM25 = analyse_for_lags(df_joined,'PM25',48,threshold=.3)
df_joined_PM25 = add_lag_features(df_joined,lags_dict_PM25)
# Model and cal values
# NOTE(review): the model below is trained on df_joined (without the lag
# features) even though df_joined_PM25 was just built -- confirm intentional.
target = "PM25"
model,df_train, df_test = train_xgb_regressor(df_joined.drop('dt', axis=1) ,target=target)
plot_feature_graphs(model,df_train.drop(target, axis=1),plot_all=False)
# focus on NO2
lags_dict_NO2 = analyse_for_lags(df_joined,'NO2',48,threshold=.3)
# NOTE(review): this passes lags_dict_PM25, not lags_dict_NO2 -- possibly a
# copy-paste slip; confirm whether PM25 lags are meant as NO2 predictors.
df_joined_NO2 = add_lag_features(df_joined,lags_dict_PM25,min_lags=1)
target = "NO2"
model,df_train, df_test = train_xgb_regressor(df_joined_NO2.drop('dt', axis=1) ,target=target)
plot_feature_graphs(model,df_train.drop(target, axis=1),plot_all=False)
# Evaluation
print('Training score: ', model.score(df_train.drop(labels=[target], axis = 1).fillna(0), df_train[target].fillna(0)))
print('Test score: ', model.score(df_test.drop(labels=[target], axis = 1).fillna(0), df_test[target].fillna(0)))
- Visualize associations and correlation to remove duplicate variables
- Do a grid-based hyperparameter optimization
# Work on a copy so the association analysis cannot mutate the modelling frame.
datawe_for_assoc_and_corr = datawe.copy()
categorical_cols = ["naam_locatie", "gebied", "weekday_name", "weather", "weather_description", "area"]
# Mixed categorical/continuous association matrix (Cramer's V, correlation
# ratio, Pearson's R) via the associations() helper defined in this file.
results = associations(datawe_for_assoc_and_corr, nominal_columns = categorical_cols, return_results=True)
## Similar plots - we also used to select features.
# corr_matrix = datawe_for_corr.corr()
# sns.heatmap(corr_matrix, vmax=.7, annot = True, square=True,cmap=sns.diverging_palette(220, 10, as_cmap=True))
# spearman = datawe_for_corr.corr(method='spearman').round(2)
# sns.heatmap(spearman, vmax=.8, annot = True, square=True,cmap=sns.diverging_palette(220, 10, as_cmap=True))
# Columns to keep (selected from the association/correlation analysis above).
datawe_new = datawe[["naam_locatie", "aantal_passanten", "hour", "month", "day", "temp", "pressure", "humidity", "wind_speed", "wind_direction", "lat", "lon"]]
X_centre, y_centre = yet_more_formatting(datawe_new, df_cr_2)
#train test split
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X_centre, y_centre, test_size=0.2, random_state=1)
# Feature count, used for the max_features grid option below.
num_var = X_train_c.shape[1]
# Getting the names of the features
features_list = list(X_train_c.columns)
label = np.ravel(y_train_c)
# Hyper parameter tuning
# NOTE(review): RandomForestRegressor is used here but no sklearn.ensemble
# import is visible in this file -- confirm it is imported elsewhere.
param_grid={'max_depth': [5, 10, 20, 50, 60, 100, None],
            'n_estimators': [5, 10, 20, 50, 60, 100],
            'min_samples_leaf': [5, 15, 50, 70, 100],
            'max_features' : ['log2', 'sqrt', (num_var/3)]}
rfgrid = RandomForestRegressor(random_state = 2019, bootstrap = True)
gsc = GridSearchCV(estimator = rfgrid, param_grid = param_grid, cv=3, scoring='neg_mean_squared_error', verbose=10, n_jobs=-1)
grid_result = gsc.fit(X_train_c, label)
print(grid_result.best_params_)
# Refit with the chosen hyper parameters.
# NOTE(review): criterion='mse' is deprecated in newer scikit-learn
# (renamed to 'squared_error').
rf_best = RandomForestRegressor(random_state = 2019, bootstrap = True, criterion='mse', max_depth = 50 , min_samples_leaf = 5, max_features = 'sqrt', n_estimators =60)
rf_best.fit(X_train_c, label)
# Evaluation
print('Training score: ', rf_best.score(X_train_c, y_train_c))
print('Test score: ', rf_best.score(X_test_c, y_test_c))
print('CV score: ', (cross_val_score(rf_best, X_train_c, y_train_c)).mean())
# Feature importance
predictions = rf_best.predict(X_test_c)
# Frame of predictions indexed by the true values, then flattened to columns.
residuals = pd.DataFrame(predictions, y_test_c)
residuals.reset_index(inplace = True)
residuals.rename({'aantal_passanten': 'y_test_c', 0: 'predictions'}, axis = 1, inplace = True)
residuals['residuals'] = residuals.y_test_c - residuals.predictions
# Feature importances sorted ascending so the biggest ones plot on top.
feature_importance = pd.DataFrame(list(zip(X_train_c.columns,rf_best.feature_importances_))).sort_values(by=1,ascending=True)
feature_importance.rename({1: 'Feature Importance', 0: 'Feature'}, axis = 1, inplace = True)
# plot it
sns.set(font_scale = 2)
fig, ax = plt.subplots()
fig.set_size_inches(16, 12)
# Only the 30 most important features are shown.
sns.barplot(x='Feature Importance', y='Feature', data=feature_importance[-30:], orient='h', palette = 'rocket', saturation=0.7)
ax.set_title("Feature Importance", fontsize=40, y=1.01)
ax.set_xlabel('Importance', fontsize = 30)
ax.set_ylabel('Feature', fontsize = 30)
fig.savefig('location_feature_importance.png')
# 2015 traffic
# Absolute daily ("etmaal") traffic volumes in 2015 on a web-tile basemap,
# with the sensor and Rijksmuseum locations overlaid.
ax = road_location.plot("etmaal_2015",figsize=(7,3), cmap="hot",legend=True)
ctx.add_basemap(ax)
plt.title("Abs traffic 2015",fontsize=20)
ax = sensor_location.plot(edgecolor='k',color='pink',markersize=100,ax=ax,label="Sensor")
ax = rijksmuseum_location.plot(edgecolor='k',color='lime',markersize=100,ax=ax,label="Rijksmuseum")
plt.legend()
# predicted growth 2015-2030
# Same map but coloured by relative change between 2015 and the 2030 forecast.
ax = road_location.plot("diff_rel",figsize=(7,3), cmap="hot",legend=True)
ctx.add_basemap(ax)
plt.title("%-traffic change 2015-2030",fontsize=20)
ax = sensor_location.plot(figsize=(7,7), edgecolor='k',color='pink',markersize=100,ax=ax,label="Sensor")
ax = rijksmuseum_location.plot(edgecolor='k',color='lime',markersize=100,ax=ax,label="Rijksmuseum")
plt.legend()
# preprocess
pollution_data = get_timeseries_dataframe()
pollution_data["year"] = pollution_data.timestamp_measured.apply(lambda x: x.year)
# Yearly mean of each pollutant column.
df_pollution = pollution_data.groupby("year").mean().reset_index()
# plot
# Yearly average PM2.5 as a categorical bar chart.
ch = chartify.Chart(x_axis_type='categorical',layout="slide_25%")
ch.set_title("PM 2.5 µm drops in 2019")
ch.set_subtitle("Before Covid")
ch.plot.bar(data_frame=df_pollution,
            categorical_columns=["year"],
            numeric_column='PM25',
            categorical_order_by='labels',
            categorical_order_ascending=True)
ch.axes.set_xaxis_label("year")
ch.axes.set_yaxis_label("avg. conc. (ppm.)")
ch.show()
# ---
# Same chart for NO2.
ch = chartify.Chart(x_axis_type='categorical',layout="slide_25%")
ch.set_title("NO2 drops in 2020 (after covid)")
ch.set_subtitle("After Covid")
ch.plot.bar(data_frame=df_pollution,
            categorical_columns=["year"],
            numeric_column='NO2',
            categorical_order_by='labels',
            categorical_order_ascending=True)
ch.axes.set_xaxis_label("year")
ch.axes.set_yaxis_label("avg. conc. (ppm.)")
ch.show()
# Wind direction plot
# Histogram of wind direction in 24 bins, each bin weighted by wind speed.
count_, direction_ = np.histogram(historic_data.wind_direction,bins=24,weights=historic_data.wind_speed)
N = len(count_)
bottom = 8        # inner radius offset of the polar bars
max_height = 4    # NOTE(review): defined but unused below
theta = 2*np.pi*direction_[:-1]/360   # bin left edges, degrees -> radians
radii = count_
width = (2*np.pi) / N
ax = plt.subplot(111, polar=True)
ax.invert_xaxis()
#ax.set_rlim(bottom=90, top=-45)
bars = ax.bar(theta, radii, width=width, bottom=bottom)
# Use custom colors and opacity
for r, bar in zip(radii, bars):
    bar.set_facecolor(plt.cm.jet(r / 10000.))
    bar.set_alpha(0.8)
# Compass convention: 0 degrees = North, angles increase clockwise.
ax.set_theta_zero_location("N")
ax.set_theta_direction(-1)
ax.set_title("Avg. Wind direction\n(weighted by speed)",fontsize=20)
ax.set(yticklabels=[])
plt.show()
# Lollipop chart comparing the survey's age distribution against city
# demography data.
ch = chartify.Chart(y_axis_type='categorical', layout="slide_50%")
ch.set_title("Age distribution")
ch.set_subtitle("demography vs survey")
ch.plot.lollipop(
    data_frame=df_demo_super,
    categorical_columns=["age", "type"],
    numeric_column='percent',
    categorical_order_by='labels',
    categorical_order_ascending=False,
    color_column="type",
)
ch.axes.set_yaxis_label("")
ch.axes.set_xaxis_label("population percent ")
ch.axes.set_yaxis_tick_orientation('horizontal')
ch.set_source_label("source: https://www.citypopulation.de/" + 70*" ")
ch.set_legend_location('outside_bottom')
ch.show()
# Bar chart of the most attractive public-space features named in the survey.
ch = chartify.Chart(y_axis_type='categorical', layout="slide_50%")
ch.set_title("Public spaces")
ch.set_subtitle("Top attractive feature ")
ch.plot.bar(
    data_frame=df_compare_plot,
    categorical_columns=['top3', 'type'],
    numeric_column='survey',
    categorical_order_ascending=True,
    color_column="type",
)
ch.axes.set_yaxis_label("")
ch.axes.set_xaxis_label("population percent ")
ch.axes.set_yaxis_tick_orientation('horizontal')
ch.set_legend_location(None)
ch.show()
# Percentage breakdown of favourite places, plus a park / non-park split.
# Fix: hoist the thrice-repeated survey["favourite_place"].dropna() into one
# variable (value_counts drops NaN anyway, so results are unchanged).
fav = survey["favourite_place"].dropna()
favourite_percent = 100 * fav.str.replace(' ', '').str.lower().value_counts() / len(fav)
favourite_park_percent = 100 * pd.Series(
    ['park' if 'park' in place else 'other' for place in fav.values]
).value_counts() / len(fav)
print("== Favourite places, percent ==")
display(favourite_percent)
print("-"*40)
print("== Park vs non park, percent ==")
display(favourite_park_percent)
# Notebook-style bare expressions: render the cluster map and the cluster-type
# table (presumably built in earlier cells — no effect when run as a script).
map_clusters
c_types
# Load every 2021-2022 traffic-count file, clean each one, and combine them
# into a single DataFrame, then derive a total-lorries column.
data_folder = "./data/data_traffic_numbers_2021_2022/"
dataframes, files = load_dataframes_traffic_numbers(data_folder)
# Fix: DataFrame.append was removed in pandas 2.0 and appending inside a loop
# is quadratic — clean all files first, then concatenate once.
df = pd.concat([clean_data(dataframes[f]) for f in files])
df["lorries_numbers"] = df.small_lorry_numbers + df.big_lorry_numbers
# Line plot of new-vehicle gCO2/km over time, one line per vehicle type.
plt.rcParams.update({'font.size':16})
van_car_pollution = pd.read_csv("./data/data_traffic_van/van_car_pollution.csv")
van_car_type = van_car_pollution.groupby("type")
fig, ax = plt.subplots(figsize=(4, 3))
# Fix: loop variable renamed from `type`, which shadowed the builtin.
for name, vehicle_type in van_car_type:
    ax.plot(vehicle_type["year"], vehicle_type["gCO2_pr_km"], label=name, linewidth=3)
ax.legend(loc="lower right")
plt.title("New vehicles - gCO2/km pollution")
ax.set(ylim=(0, 220))
plt.xlabel("year")
plt.ylabel("gram CO2 / km")
# Vehicle mix at the sensor: totals per size class and their percentage
# shares, rendered as a pie chart.
sensor1 = df_sensor
vehicle_numbers = [sensor1.small_car_numbers.sum(),
                   sensor1.medium_car_numbers.sum(),
                   sensor1.big_car_numbers.sum(),
                   sensor1.lorries_numbers.sum()]
vehicle_numbers_total = sum(vehicle_numbers)
# Fix: comprehension replaces four copy-pasted ratio lines (same arithmetic
# order, so the float results are identical).
vehicle_ratios = [n / vehicle_numbers_total * 100 for n in vehicle_numbers]
fig, ax = plt.subplots(figsize=(10, 5), constrained_layout=True)
plt.pie(vehicle_ratios, labels=["small cars","medium cars","vans and big cars","lorries"])
# Average van age per region over time, restricted to the all-owners totals.
df = df_van
region_groups = df[df.main_user == "total_vans"].groupby("region")
fig, ax = plt.subplots()
for region_name, region_df in region_groups:
    ax.plot(region_df["years"], region_df["avg_age_vans_years"],
            label=region_name, linewidth=3)
ax.legend(loc="best")
plt.title("Van age\n(vans getting older)")
plt.ylabel("Avg. van age (years)")
plt.xlabel("year")
fig.autofmt_xdate()
# Average van age in Amsterdam, split by main user (ownership class).
regions = df[df.region == "Amsterdam"].groupby("main_user")
fig, ax = plt.subplots(figsize=(5, 4))
for name, region in regions:
    ax.plot(region["years"], region["avg_age_vans_years"], label=name, linewidth=3)
ax.legend(loc="best")
# Fix: corrected typo in the displayed title ("spit" -> "split").
plt.title("Vans split by ownership\n(Privates don't get new)")
plt.xlabel("year")
plt.ylabel("Avg. van age (years)")
fig.autofmt_xdate()
# Number of vans in Amsterdam per ownership class over time.
regions = df[df.region == "Amsterdam"].groupby("main_user")
fig, ax = plt.subplots()
for owner, owner_df in regions:
    ax.plot(owner_df["years"], owner_df["number_of_vans_in_class"],
            label=owner, linewidth=3)
ax.legend(loc="best")
plt.title("Van numbers Amsterdam\n(private vans dropping)")
plt.xlabel("year")
plt.ylabel("Num. vans")
fig.autofmt_xdate()
!pip freeze